package com.softeem.spider;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Core crawler class: downloads the page at a configured URL and extracts
 * every substring of its source that matches a configured regular expression.
 */
public class Spider {

    /** Address of the target page to fetch. */
    private final String url;
    /** Regular expression (match pattern) used to locate resource addresses. */
    private final String regex;

    /**
     * Creates a spider for one page and one match pattern.
     *
     * @param url   address of the page whose source will be scanned
     * @param regex regular expression whose matches are treated as resource addresses
     */
    public Spider(String url, String regex) {
        this.url = url;
        this.regex = regex;
    }

    /**
     * Fetches the page source and collects every match of the configured
     * pattern, in the order the matches occur.
     *
     * <p>The whole match ({@code Matcher.group()}) is used, not a capture
     * group, so the pattern itself must match exactly the resource address.
     * Matches that do not already carry an {@code http://} or {@code https://}
     * scheme get {@code "https:"} prepended.
     *
     * @return the list of resource addresses found; empty when nothing matches
     * @throws IOException declared for the page download performed by
     *                     {@code HttpUtils.readAsHtml}
     */
    public List<String> getResourcesUrl() throws IOException {
        List<String> urls = new ArrayList<>();
        // Read the page source.
        String html = HttpUtils.readAsHtml(url);
        // Compile the expression and scan the source for every occurrence.
        Matcher m = Pattern.compile(regex).matcher(html);
        while (m.find()) {
            urls.add(withScheme(m.group()));
        }
        return urls;
    }

    /**
     * Returns {@code address} unchanged when it already starts with
     * {@code http://} or {@code https://}; otherwise prepends {@code "https:"}.
     *
     * <p>The previous check ({@code !startsWith("http") || !startsWith("https")})
     * was true for every plain {@code http://} address and rewrote it to
     * {@code https:http://...}. Prepending only {@code "https:"} yields a
     * well-formed URL for protocol-relative matches such as {@code //host/path}.
     */
    private static String withScheme(String address) {
        if (address.startsWith("http://") || address.startsWith("https://")) {
            return address;
        }
        return "https:" + address;
    }

}
